R Sys.Date()First designate the libraries and the cells that were resampled.
libs <- list("original",
"standard",
"biotinylated",
"phosphorothioate")
cells <- list(
standard = c(
"Cell_4Bone_63_70",
"Cell_4Bone_27_61",
"Cell_4Bone_34_48",
"Cell_1Tumor_0_55",
"Cell_1Tumor_7_50",
"Cell_4Bone_30_23",
"Cell_4Bone_61_22",
"Cell_4Bone_66_9",
"Cell_3Brain_47_38",
"Cell_3Brain_14_46"
),
biotinylated = c(
"Cell_4Bone_63_70",
"Cell_4Bone_27_61",
"Cell_4Bone_34_48",
"Cell_1Tumor_0_55",
"Cell_1Tumor_7_50",
"Cell_3Brain_47_38",
"Cell_3Brain_14_46",
"Cell_4Bone_28_54",
"Cell_4Bone_26_20",
"Cell_4Bone_60_43"
),
phosphorothioate = c(
"Cell_1Tumor_7_50",
"Cell_3Brain_47_38",
"Cell_4Bone_28_54",
"Cell_4Bone_26_20",
"Cell_4Bone_60_43"
)
)
bc_metadat <- read_tsv(file.path(data_dir,
"original",
"fastq",
"original",
"well_data_barcode_keys.txt"),
col_names = c("cell_id", "barcode_10x"))
## original library to compare against
reflib <- "original"
resampled_libs <- c(
"standard",
"biotinylated",
"phosphorothioate")
## reference resampled lib for resampled vs control plots
resampled_lib <- "phosphorothioate"
## pretty name for libraries
lib_names = c(
original = "Original Library",
standard = "PCR",
biotinylated = "PCR +\n5' Biotin",
phosphorothioate = "PCR +\n5' Biotin +\n3' Phosphorothioates"
)
## pretty names for cells
cell_names <- list(
standard = c(
"Cell_4Bone_63_70" = "Bone Met Cell #4",
"Cell_4Bone_27_61" = "Bone Met Cell #5",
"Cell_4Bone_34_48" = "Bone Met Cell #6",
"Cell_1Tumor_0_55" = "Primary Tumor Cell #2",
"Cell_1Tumor_7_50" = "Primary Tumor Cell #1",
"Cell_4Bone_30_23" = "Bone Met Cell #7",
"Cell_4Bone_61_22" = "Bone Met Cell #8",
"Cell_4Bone_66_9" = "Bone Met Cell #9",
"Cell_3Brain_47_38" = "Brain Met Cell #1",
"Cell_3Brain_14_46" = "Brain Met Cell #2"
),
biotinylated = c(
"Cell_4Bone_63_70" = "Bone Met Cell #4",
"Cell_4Bone_27_61" = "Bone Met Cell #5",
"Cell_4Bone_34_48" = "Bone Met Cell #6",
"Cell_1Tumor_0_55" = "Primary Tumor Cell #2",
"Cell_1Tumor_7_50" = "Primary Tumor Cell #1",
"Cell_3Brain_47_38" = "Brain Met Cell #1",
"Cell_3Brain_14_46" = "Brain Met Cell #2",
"Cell_4Bone_28_54" = "Bone Met Cell #1",
"Cell_4Bone_26_20" = "Bone Met Cell #2",
"Cell_4Bone_60_43" = "Bone Met Cell #3"
),
phosphorothioate = c(
"Cell_1Tumor_7_50" = "Primary Tumor Cell #1",
"Cell_3Brain_47_38" = "Brain Met Cell #1",
"Cell_4Bone_28_54" = "Bone Met Cell #1",
"Cell_4Bone_26_20" = "Bone Met Cell #2",
"Cell_4Bone_60_43" = "Bone Met Cell #3"
)
)Load and organize a table for each library of read counts per cell per gene, and a table of umi counts per cell per gene.
umis_to_genes <- function(umipath, cells_to_exclude = c("Cell_unmatched")){
umis <- read_tsv(umipath,
col_names = c("barcode_10x",
"umi_molecule",
"count")) %>%
filter(barcode_10x != cells_to_exclude)
mol_fields <- str_count(umis$umi_molecule[1], "::")
if(mol_fields == 2 ){
umis <- separate(umis, umi_molecule,
into = c("seq", "genome", "gene"),
sep = "::") %>%
mutate(gene = str_c(genome, "::", gene))
} else if (mol_fields == 1){
umis <- separate(umis, umi_molecule,
into = c("seq", "gene"),
sep = "::")
} else {
stop("separator :: missing from umi_molecule field")
}
reads <- select(umis,
barcode_10x,
gene,
count)
reads <- group_by(reads,
barcode_10x, gene) %>%
summarize(counts = sum(count))
reads <- spread(reads, barcode_10x, counts,
fill = 0L)
reads
}
## read in umigroups flat file with read counts per umi per gene per cell
## expand out to a read count matrix
umipaths <- file.path(data_dir,
libs,
"umis",
"umigroups.txt.gz")
read_dat <- map(umipaths,
~umis_to_genes(.))
names(read_dat) <- libs
## read in umi_tools count table with umi counts per gene per cell
umi_dat <- map(libs,
~read_tsv(file.path(data_dir,
.x,
"dgematrix",
"dge_matrix.txt")) %>%
select(-matches("Cell_unmatched")))
names(umi_dat) <- libs
all_cells <- c(list(original = unlist(cells) %>% unname()),
cells)
cell_obj_mdata <- map(all_cells,
~mutate(bc_metadat,
resampled = ifelse(cell_id %in% .x,
TRUE,
FALSE)))Next organize these tables into simple classes called resampled-sets to keep track of each experiment’s relavant raw, processed, and meta data.
#' simple class to hold info for each experiment
create_sc_obj <- function(umi_df,
read_df,
cell_mdata_df){
x <- list()
class(x) <- "resampled-set"
x$umis <- umi_df
x$reads <- read_df
x$meta_dat <- cell_mdata_df
return(x)
}
sc_objs <- list(umi_dat, read_dat, cell_obj_mdata)
sc_objs <- pmap(sc_objs, create_sc_obj)
rm(umi_dat)
rm(read_dat)Next perform basic processing. 1) generate separate objects to store sparse matrices of umi and read counts. 2) normalize read and umi count data by total library size (sum of all read or umi counts for all cells in the experiment) and report as Reads per million or UMIs per million. 3) Compute per cell metrics (read and umi counts, sequencing saturation)
tidy_to_matrix <- function(df){
df <- as.data.frame(df)
rownames(df) <- df[, 1]
df[, 1] <- NULL
mat <- as.matrix(df)
mat <- as(mat, "sparseMatrix")
return(mat)
}
#' keep both tidy and matrix objs
generate_matrices <- function(sc_obj){
sc_obj$umi_matrix <- tidy_to_matrix(sc_obj$umis)
sc_obj$read_matrix <- tidy_to_matrix(sc_obj$reads)
sc_obj
}
#' normalize by library size (Reads per Million)
norm_libsize <- function(sc_obj){
sc_obj$norm_umi <- 1e6 * sweep(sc_obj$umi_matrix, 2,
sum(as.vector(sc_obj$umi_matrix)), "/")
sc_obj$norm_reads <- 1e6 * sweep(sc_obj$read_matrix, 2,
sum(as.vector(sc_obj$read_matrix)), "/")
sc_obj
}
add_metadata <- function(sc_obj, dat){
if (is.vector(dat)){
new_colname <- deparse(substitute(dat))
df <- data_frame(!!new_colname := dat)
df[[new_colname]] <- dat
df[["cell_id"]] <- names(dat)
sc_obj$meta_dat <- left_join(sc_obj$meta_dat,
df,
by = "cell_id")
} else if (is.data.frame(dat)) {
sc_obj$meta_dat <- left_join(sc_obj$meta_dat,
dat,
by = "cell_id")
}
sc_obj
}
compute_summaries <- function(sc_obj){
## raw counts
total_umis <- colSums(sc_obj$umi_matrix)
names(total_umis) <- colnames(sc_obj$umi_matrix)
total_reads <- colSums(sc_obj$read_matrix)
names(total_reads) <- colnames(sc_obj$read_matrix)
## norm counts
norm_total_umis <- colSums(sc_obj$norm_umi)
names(norm_total_umis) <- colnames(sc_obj$norm_umi)
norm_total_reads <- colSums(sc_obj$norm_reads)
names(norm_total_reads) <- colnames(sc_obj$norm_reads)
sc_obj <- add_metadata(sc_obj, total_umis)
sc_obj <- add_metadata(sc_obj, total_reads)
sc_obj <- add_metadata(sc_obj, norm_total_umis)
sc_obj <- add_metadata(sc_obj, norm_total_reads)
## compute cDNA duplication rate
sc_obj$meta_dat$cDNA_duplication <- 1 - (sc_obj$meta_dat$total_umis /
sc_obj$meta_dat$total_reads)
sc_obj
}
sc_objs <- map(sc_objs, generate_matrices)
sc_objs <- map(sc_objs, norm_libsize)
sc_objs <- map(sc_objs, compute_summaries)Compute enrichment of reads/umis over the original library.
sc_objs <- map(sc_objs,
function(sub_dat){
og_counts <- select(sc_objs[[reflib]]$meta_dat,
og_total_reads = total_reads,
og_total_umis = total_umis,
og_norm_total_umis = norm_total_umis,
og_norm_total_reads = norm_total_reads,
og_cDNA_duplication = cDNA_duplication,
cell_id)
sub_dat$meta_dat <- left_join(sub_dat$meta_dat,
og_counts,
by = "cell_id")
sub_dat$meta_dat <- mutate(sub_dat$meta_dat,
read_proportion = log2( total_reads / og_total_reads),
umi_proportion = log2( total_umis / og_total_umis),
norm_read_proportion = log2( norm_total_reads /
og_norm_total_reads),
norm_umi_proportion = log2( norm_total_umis /
og_norm_total_umis))
sub_dat
})sc_metadat <- map(sc_objs, ~.x$meta_dat) %>%
bind_rows(.id = "library") %>%
mutate(library = factor(library, levels = libs)) %>%
arrange(resampled)
plt <- ggplot(sc_metadat, aes(total_umis, cDNA_duplication)) +
geom_point(aes(color = resampled), size = 0.5) +
labs(x = expression(paste("# of UMIs")),
y = "Sequencing saturation") +
scale_x_log10(labels = scales::comma) +
scale_color_manual(values = color_palette) +
guides(colour = guide_legend(override.aes = list(size = 5))) +
facet_wrap(~library,
labeller = labeller(library = lib_names)) +
theme_cowplot(font_size = 16, line_size = 1) +
theme(legend.position = "top",
plot.margin = unit(c(5.5, 20.5, 5.5, 5.5),
"points"))
pltNote the high level of sequencing saturation (0 = no-duplication, 1 = all duplicates) in the original library. Also note that the libraries tend to have higher saturatioin rates, after subsampling.
save_plot("cDNA_duplication.pdf", plt,
base_height = 5.5,
base_aspect_ratio = 0.85)global_plot_theme <- theme(
legend.position = "top",
legend.text = element_text(size = 10),
strip.text = element_text(size = 8))
resampled_metadat <- filter(sc_metadat,
library != "original") %>%
mutate(library = factor(library,
levels = c(
"standard",
"biotinylated",
"phosphorothioate")))
unnorm_plt <- ggplot(resampled_metadat,
aes(og_total_reads / 3, total_reads / 3, colour = resampled)) +
geom_point(size = 0.5) +
geom_abline(slope = 1) +
facet_wrap(~library, nrow = 1) +
coord_fixed() +
xlab("original library\nreads count (Thousands)") +
ylab("resampled library\nreads count (Thousands)") +
# ggtitle("Raw reads associated with each cell barcode") +
scale_colour_manual(name = "Resampled:",
values = color_palette) +
theme_cowplot(font_size = 10, line_size = .5) +
global_plot_theme
norm_plt <- ggplot(resampled_metadat, aes(og_norm_total_reads / 1e3, norm_total_reads / 1e3, colour = resampled)) +
geom_point(size = 0.5) +
geom_abline(slope = 1) +
facet_wrap(~library, nrow = 1) +
xlab("original library\nRPM (Thousands)") +
ylab("resampled library\nRPM (Thousands)") +
scale_colour_manual(name = "Resampled:",
values = color_palette) +
theme_cowplot(font_size = 10, line_size = 0.5) +
theme(aspect.ratio = 1) +
global_plot_theme
unnorm_umi_plt <- ggplot(resampled_metadat,
aes(og_total_umis / 1e3, total_umis / 1e3, colour = resampled)) +
geom_point(size = 0.5) +
geom_abline(slope = 1) +
coord_fixed() +
facet_wrap(~library, nrow = 1) +
xlab("original library\nUMI count (Thousands)") +
ylab("resampled library\nUMI count (Thousands)") +
scale_colour_manual(name = "Resampled:",
values = color_palette) +
theme_cowplot(font_size = 10, line_size = .5) +
global_plot_theme
norm_umi_plt <- ggplot(resampled_metadat,
aes(og_norm_total_umis / 1e3, norm_total_umis / 1e3, colour = resampled)) +
geom_point(size = 0.5) +
geom_abline(slope = 1) +
facet_wrap(~library, nrow = 1) +
xlab("original library\nUMI normalized RPM (Thousands)") +
ylab("resampled library\nUMIs per Million (Thousands)") +
scale_colour_manual(name = "Resampled:",
values = color_palette) +
theme_cowplot(font_size = 10, line_size = 0.5) +
theme(aspect.ratio = 1) +
global_plot_theme
plt <- plot_grid(unnorm_plt, norm_plt, unnorm_umi_plt, norm_umi_plt,
labels = "AUTO",
align = 'hv')
pltsave_plot("reads_per_barcode_scatterplots.pdf", plt, base_width = 8 )umi_plots <- map(split(resampled_metadat, resampled_metadat$library),
function(x){
ggplot(x,
aes(og_total_umis, norm_umi_proportion, colour = resampled)) +
geom_point(size = 0.5) +
geom_hline(aes(yintercept = 0),
linetype ="dashed",
color = "darkgrey") +
scale_x_continuous(labels = scales::comma) +
labs(x = "Abundance in original\nlibrary (UMIs)",
y = expression(paste( "Log"[2], " Normalized UMIs ",
frac("resampled", "original")))) +
scale_colour_manual(name = "Resampled:", values = color_palette) +
guides(colour = guide_legend(override.aes = list(size = 5))) +
theme_cowplot(font_size = 16, line_size = 1) +
theme(legend.position = "top",
legend.text = element_text(size = 12))})
plt <- plot_grid(plotlist = umi_plots, nrow = 1)
save_plot("umi_ratio_MA.pdf", plt,
ncol = 3, nrow = 1,
base_width = 5,
base_height = 5)ggplot(resampled_metadat, aes(resampled,
read_proportion, fill = resampled)) +
geom_boxplot(coef = Inf) +
facet_wrap(~library, nrow = 1) +
xlab("Selected for Reamplification") +
ylab(expression(paste( "Log"[2], " Reads ", frac("resampled", "original")))) +
scale_fill_manual(name = "Resampled:",
values = color_palette) +
theme_cowplot(font_size = 16, line_size = 0.5) +
theme(
axis.title.x = element_blank(),
axis.text.x = element_blank(),
legend.position = "top",
legend.text = element_text(size = 12)
)ggsave("reads_ratio_per_barcode_boxplot.pdf", width = 6, height = 5)
ggplot(resampled_metadat, aes(resampled,
umi_proportion, fill = resampled)) +
geom_boxplot(coef = Inf) +
facet_wrap(~library, nrow = 1) +
xlab("Selected for Reamplification") +
ylab(expression(paste( "Log"[2], " UMIs ", frac("resampled", "original")))) +
scale_fill_manual(name = "Resampled:",
values = color_palette) +
theme_cowplot(font_size = 16, line_size = 0.5) +
theme(
axis.title.x = element_blank(),
axis.text.x = element_blank(),
legend.position = "top",
legend.text = element_text(size = 12)
)ggsave("umi_ratio_per_barcode_boxplot.pdf", width = 6, height = 5)
ggplot(resampled_metadat, aes(resampled,
norm_read_proportion, fill = resampled)) +
geom_boxplot(coef = Inf) +
facet_wrap(~library, nrow = 1) +
xlab("Selected for Reamplification") +
ylab(expression(paste( "Log"[2], " normalized Reads ", frac("resampled", "original")))) +
scale_fill_manual(name = "Resampled:",
values = color_palette) +
theme_cowplot(font_size = 16, line_size = 0.5) +
theme(
axis.title.x = element_blank(),
axis.text.x = element_blank(),
legend.position = "top",
legend.text = element_text(size = 12)
)ggsave("norm_reads_ratio_per_barcode_boxplot.pdf", width = 6, height = 5)
ggplot(resampled_metadat, aes(resampled,
norm_umi_proportion, fill = resampled)) +
geom_boxplot(coef = Inf) +
facet_wrap(~library, nrow = 1) +
xlab("Selected for Reamplification") +
ylab(expression(paste( "Log"[2], " normalized UMIs ", frac("resampled", "original")))) +
scale_fill_manual(name = "Resampled:",
values = color_palette) +
theme_cowplot(font_size = 16, line_size = 0.5) +
theme(
axis.title.x = element_blank(),
axis.text.x = element_blank(),
legend.position = "top",
legend.text = element_text(size = 12)
)ggsave("norm_umi_ratio_per_barcode_boxplot.pdf", width = 6, height = 5)dat <- group_by(resampled_metadat, library) %>%
filter(library != "original") %>%
mutate(total_new = sum(total_reads, na.rm = T),
total_old = sum(og_total_reads, na.rm = T))
dat_group <- group_by(dat, library, resampled) %>%
summarize(total_new = sum(total_reads,
na.rm = T) / unique(total_new),
total_old = sum(og_total_reads,
na.rm = T) / unique(total_old)) %>%
gather(lib, percent_lib, -library, -resampled ) %>%
mutate(lib = factor(lib, levels = c("total_old", "total_new"),
labels = c("Original\nLibrary",
"Resampled\nLibrary")),
resampled = factor(resampled, levels = c( "TRUE", "FALSE"))) %>%
arrange(resampled)
plt <- ggplot(dat_group, aes(lib, percent_lib, fill = resampled)) +
geom_bar(stat = "identity") +
ylab("Fraction of Reads Assigned") +
scale_fill_manual(name = "Resampled:",
values = rev(color_palette)) +
facet_wrap(~library, labeller = labeller(library = lib_names)) +
theme_cowplot(font_size = 16, line_size = 1) +
theme(
axis.title.x = element_blank(),
axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5),
legend.position = "top",
legend.text = element_text(size = 16),
strip.text = element_text(margin = margin(2, 2, 2, 2, "mm"))
)
save_plot("proportion_reads_all_barcode_barplot.pdf",
plt,
base_height = 7,
base_aspect_ratio = 1)
dat_group %>%
rename(Method = library) %>%
spread(lib, percent_lib) %>%
mutate(`Targeted Library Read Fold-Enrichment` = `Resampled\nLibrary` / `Original\nLibrary`) %>%
filter(resampled == T) %>%
select(Method, `Targeted Library Read Fold-Enrichment`)add_species_counts <- function(sc_obj,
mouse_gene_prefix = "mm38::",
human_gene_prefix = "hg38::"){
## get mouse and human reads
g_ids <- rownames(sc_obj$read_matrix)
mouse_ids <- str_subset(g_ids, str_c("^", mouse_gene_prefix))
human_ids <- str_subset(g_ids, str_c("^", human_gene_prefix))
mouse_reads = colSums(sc_obj$read_matrix[mouse_ids, ])
human_reads = colSums(sc_obj$read_matrix[human_ids, ])
## get mouse and human UMIs
g_ids <- rownames(sc_obj$umi_matrix)
mouse_ids <- str_subset(g_ids, str_c("^", mouse_gene_prefix))
human_ids <- str_subset(g_ids, str_c("^", human_gene_prefix))
mouse_umis = colSums(sc_obj$umi_matrix[mouse_ids, ])
human_umis = colSums(sc_obj$umi_matrix[human_ids, ])
## get norm counts for reads
g_ids <- rownames(sc_obj$norm_reads)
mouse_ids <- str_subset(g_ids, str_c("^", mouse_gene_prefix))
human_ids <- str_subset(g_ids, str_c("^", human_gene_prefix))
norm_human_reads <- colSums(sc_obj$norm_reads[human_ids, ])
norm_mouse_reads <- colSums(sc_obj$norm_reads[mouse_ids, ])
## get norm counts for umis
g_ids <- rownames(sc_obj$norm_umi)
mouse_ids <- str_subset(g_ids, str_c("^", mouse_gene_prefix))
human_ids <- str_subset(g_ids, str_c("^", human_gene_prefix))
norm_human_umis <- colSums(sc_obj$norm_umi[human_ids, ])
norm_mouse_umis <- colSums(sc_obj$norm_umi[mouse_ids, ])
sc_obj <- add_metadata(sc_obj, human_reads)
sc_obj <- add_metadata(sc_obj, mouse_reads)
sc_obj <- add_metadata(sc_obj, human_umis)
sc_obj <- add_metadata(sc_obj, mouse_umis)
sc_obj <- add_metadata(sc_obj, norm_human_reads)
sc_obj <- add_metadata(sc_obj, norm_mouse_reads)
sc_obj <- add_metadata(sc_obj, norm_human_umis)
sc_obj <- add_metadata(sc_obj, norm_mouse_umis)
## make sure mouse + human == total
stopifnot(all(sc_obj$meta_dat$total_reads == sc_obj$meta_dat$mouse_reads +
sc_obj$meta_dat$human_reads, na.rm = T))
stopifnot(all(sc_obj$meta_dat$total_umis == sc_obj$meta_dat$mouse_umis +
sc_obj$meta_dat$human_umis, na.rm = T))
## check floating point totals
tol <- 1e-5
reads_check <- all(abs(sc_obj$meta_dat$norm_total_reads - (sc_obj$meta_dat$norm_mouse_reads +
sc_obj$meta_dat$norm_human_reads)) <= tol, na.rm = T)
stopifnot(reads_check)
umis_check <- all(abs(sc_obj$meta_dat$norm_total_umis - (sc_obj$meta_dat$norm_mouse_umis +
sc_obj$meta_dat$norm_human_umis)) <= tol, na.rm = T)
stopifnot(umis_check)
## calculate species purity (human / human + mouse)
sc_obj$meta_dat <- mutate(sc_obj$meta_dat,
purity_reads = human_reads / (human_reads + mouse_reads),
purity_umis = human_umis / (human_umis + mouse_umis))
sc_obj
}
sc_objs <- map(sc_objs, add_species_counts)## add in metadat columns for original mouse and original human data
sc_objs <- map(sc_objs,
function(sub_dat){
og_dat <- select(sc_objs$original$meta_dat,
cell_id,
str_subset(colnames(sc_objs$original$meta_dat),
"human|mouse|purity"))
cols <- colnames(og_dat)
new_cols <- c("cell_id", str_c("og_", cols[2:length(cols)]))
colnames(og_dat) <- new_cols
sub_dat$meta_dat <- left_join(sub_dat$meta_dat,
og_dat,
by = "cell_id")
sub_dat$meta_dat <- mutate(sub_dat$meta_dat,
norm_human_read_proportion = log2( norm_human_reads /
og_norm_human_reads),
norm_human_umi_proportion = log2( norm_human_umis /
og_norm_human_umis),
norm_mouse_read_proportion = log2( norm_mouse_reads /
og_norm_mouse_reads),
norm_mouse_umi_proportion = log2( norm_mouse_umis /
og_norm_mouse_umis))
sub_dat
})
resampled_metadat <- map(sc_objs, ~.x$meta_dat) %>%
bind_rows(.id = "library") %>%
mutate(library = factor(library,
levels = c(
"original",
"standard",
"biotinylated",
"phosphorothioate"))) %>%
arrange(resampled)read_plots <- map(split(resampled_metadat, resampled_metadat$library),
function(x){
ggplot(x,
aes(human_reads, mouse_reads,
colour = resampled)) +
geom_point(size = 0.5) +
facet_wrap(~library, nrow = 1, scales = "free",
labeller = labeller(library = lib_names)) +
xlab("Human reads") +
ylab("Mouse reads") +
scale_colour_manual(name = "Resampled:",
values = color_palette) +
scale_x_continuous(labels = scales::comma) +
scale_y_continuous(labels = scales::comma) +
guides(colour = guide_legend(override.aes = list(size = 5))) +
theme_cowplot(font_size = 16, line_size = 1) +
theme(legend.position = "top",
legend.text = element_text(size = 12))
})
plt <- plot_grid(plotlist = read_plots,
nrow = 1)
save_plot("read_scatterplot_human_mouse.pdf",
plt, ncol = length(libs), nrow = 1,
base_height = 5, base_aspect_ratio = 1)
umi_plots <- map(split(resampled_metadat, resampled_metadat$library),
function(x){
ggplot(x,
aes(human_umis,
mouse_umis,
colour = resampled)) +
geom_point(size = 0.5) +
facet_wrap(~library, nrow = 1, scales = "free",
labeller = labeller(library = lib_names)) +
xlab("Human UMIs") +
ylab("Mouse UMIs") +
scale_colour_manual(name = "Resampled:",
values = color_palette) +
scale_x_continuous(labels = scales::comma) +
scale_y_continuous(labels = scales::comma) +
guides(colour = guide_legend(override.aes = list(size = 5))) +
theme_cowplot(font_size = 16, line_size = 1) +
theme(legend.position = "top",
legend.text = element_text(size = 12))
})
plt <- plot_grid(plotlist = umi_plots,
nrow = 1)
save_plot("umi_scatterplot_human_mouse.pdf",
plt, ncol = length(libs), nrow = 1,
base_height = 5, base_aspect_ratio = 1)## compute per gene or per gene/umi combo enrichment
detected_molecules <- function(sc_obj, molecule = "gene"){
umis <- sc_obj$umi_matrix
if (molecule == "gene"){
human_mat <- umis[str_detect(rownames(umis), "hg38::"), ]
mouse_mat <- umis[str_detect(rownames(umis), "mm38::"), ]
n_human_genes <- colSums(human_mat > 0)
n_mouse_genes <- colSums(mouse_mat > 0)
out_mdat <- data_frame(cell_id = colnames(umis),
n_human_genes = n_human_genes,
n_mouse_genes = n_mouse_genes)
sc_obj <- add_metadata(sc_obj, out_mdat)
}
}
sc_objs <- map(sc_objs, ~detected_molecules(.x))resampled_metadat <- map(sc_objs, ~.x$meta_dat) %>%
bind_rows(.id = "library") %>%
mutate(library = factor(library,
levels = c(
"original",
"standard",
"biotinylated",
"phosphorothioate")))
og_genes <- filter(resampled_metadat,
library == "original") %>%
dplyr::select(cell_id,
og_human_genes = n_human_genes,
og_mouse_genes = n_mouse_genes)
resampled_metadat <- left_join(resampled_metadat,
og_genes,
by = "cell_id")
ggplot(resampled_metadat, aes(og_human_genes,
n_human_genes, colour = resampled)) +
geom_point() +
ylab("resampled_genes") +
xlab("original_genes") +
scale_colour_manual(name = "Resampled:", values = color_palette) +
facet_wrap(~library, nrow = 1) +
theme_cowplot(font_size = 16, line_size = 1) +
theme(
legend.position = "top",
legend.text = element_text(size = 16)
)ggplot(resampled_metadat, aes(og_mouse_genes,
n_mouse_genes,
colour = resampled)) +
geom_point() +
ylab("resampled_genes") +
xlab("original_genes") +
scale_colour_manual(name = "Resampled:", values = color_palette) +
facet_wrap(~library, nrow = 1) +
theme_cowplot(font_size = 16, line_size = 1) +
theme(
legend.position = "top",
legend.text = element_text(size = 16)
)calc_gene_sensitivity <- function(sc_obj,
type = "umi",
mouse_gene_prefix = "mm38::",
human_gene_prefix = "hg38::"){
if (type == "umi"){
count_matrix <- sc_obj$umi_matrix
} else {
count_matrix <- sc_obj$read_matrix
}
# generate list named with barcode of each detected gene and
# respective read/umi count
genes_detected <- apply(count_matrix, 2, function(x) x[x > 0])
sc_obj$genes_detected <- genes_detected
sc_obj
}
sc_objs <- map(sc_objs, calc_gene_sensitivity)sc_objs <- map(sc_objs,
function(x){
og_genes <- sc_objs$original$genes_detected
sub_genes <- x$genes_detected
# subset list of cell barcodes to the same as the og experiment
# and also reorders the barcodes to match
sub_genes <- sub_genes[names(og_genes)]
if(length(sub_genes) != length(og_genes)){
stop("barcode lengths not the same")
}
shared_genes <- map2(sub_genes,
og_genes,
~intersect(names(.x),
names(.y)))
new_genes <- map2(sub_genes,
og_genes,
~setdiff(names(.x),
names(.y)))
not_recovered_genes <- map2(og_genes,
sub_genes,
~setdiff(names(.x),
names(.y)))
x$shared_genes <- shared_genes
x$new_genes <- new_genes
x$not_recovered_genes <- not_recovered_genes
return(x)
})
## add gene recovery info to meta data table
sc_objs <- map(sc_objs,
function(x){
shared_genes <- map2_dfr(x$shared_genes,
names(x$shared_genes),
function(x, y){
mouse <- sum(str_detect(x, "^mm38::")) ;
human <- sum(str_detect(x, "^hg38::")) ;
data_frame(cell_id = y,
mouse_shared_genes = mouse,
human_shared_genes = human,
shared_genes = mouse + human)
})
not_recovered_genes <- map2_dfr(x$not_recovered_genes,
names(x$not_recovered_genes),
function(x, y){
mouse <- sum(str_detect(x, "^mm38::")) ;
human <- sum(str_detect(x, "^hg38::")) ;
data_frame(cell_id = y,
mouse_not_recovered_genes = mouse,
human_not_recovered_genes = human,
not_recovered_genes = mouse + human)
})
new_genes <- map2_dfr(x$new_genes,
names(x$new_genes),
function(x, y){
mouse <- sum(str_detect(x, "^mm38::")) ;
human <- sum(str_detect(x, "^hg38::")) ;
data_frame(cell_id = y,
mouse_new_genes = mouse,
human_new_genes = human,
new_genes = mouse + human)
})
gene_mdata <- left_join(shared_genes,
not_recovered_genes,
by = "cell_id") %>%
left_join(., new_genes, by = "cell_id")
x <- add_metadata(x, gene_mdata)
x
})
resampled_metadat <- map(sc_objs, ~.x$meta_dat) %>%
bind_rows(.id = "library") %>%
mutate(library = factor(library,
levels = c(
"original",
"standard",
"biotinylated",
"phosphorothioate")))genes_recovered <- resampled_metadat %>%
dplyr::filter(library != "original") %>%
dplyr::select(cell_id,
library,
resampled,
shared_genes,
not_recovered_genes,
new_genes)
genes_recovered <- gather(genes_recovered,
key = type, value = count,
-cell_id, -resampled, -library)
genes_recovered <- mutate(genes_recovered,
type = str_replace_all(type, "_", "\n"))
plt <- ggplot(genes_recovered,
aes(cell_id, count)) +
geom_point(aes(color = resampled),
size = 0.6,
alpha = 0.75) +
facet_grid(type ~ library) +
theme(axis.text.x = element_blank(),
axis.title.x = element_blank(),
strip.text.y = element_text(size = 12,
margin = margin(0,0.2,0,0.2, "cm"))) +
scale_color_manual(values = color_palette)
plt save_plot("new_genes_detected.pdf", plt, base_width = 8, base_height = 8)
targeted <- genes_recovered %>%
dplyr::filter(library == "phosphorothioate",
resampled)
targetedplt_dat <- genes_recovered %>%
dplyr::filter(library != "phosphorothioate",
resampled) %>%
group_by(type) %>%
summarize(count = mean(count)) %>%
mutate(cell_id = "Not Targeted Barcodes") %>%
bind_rows(targeted, .) %>%
filter(type == "new\ngenes")
plt <- ggplot(plt_dat,
aes(cell_id, count)) +
geom_bar(aes(fill = cell_id),
stat = "identity") +
labs(y = "Newly detected genes") +
theme(axis.title.x = element_blank(),
axis.text.x = element_text(angle = 45,
hjust = 1),
legend.position = "none") +
scale_fill_brewer(palette = "Set1")
save_plot("new_genes_barplot.pdf", plt,
base_width = 3.6, base_height = 5)
## species specific
genes_recovered <- resampled_metadat %>%
dplyr::filter(library == "phosphorothioate") %>%
select(cell_id, resampled, purity_reads, ends_with("genes")) %>%
mutate(species = ifelse(purity_reads > 0.80,
"human",
ifelse(purity_reads < 0.20,
"mouse",
"mixed"))) %>%
filter(species != "mixed") %>%
mutate(shared_genes_species = ifelse(species == "human",
human_shared_genes,
mouse_shared_genes),
new_genes_species = ifelse(species == "human",
human_new_genes,
mouse_new_genes),
not_recovered_genes_species = ifelse(species == "human",
human_not_recovered_genes,
mouse_not_recovered_genes)) %>%
select(cell_id, resampled, ends_with("_species")) %>%
gather(key = type, value = count,
-cell_id, -resampled) %>%
mutate(type = str_replace(type, "_species", "") %>%
str_replace_all(., "_", "\n"))
targeted <- genes_recovered %>%
dplyr::filter(resampled)
plt_dat <- genes_recovered %>%
dplyr::filter(!resampled) %>%
group_by(type) %>%
summarize(count = mean(count)) %>%
mutate(cell_id = "Not targeted\nbarcodes\n(mean)") %>%
bind_rows(targeted, .) %>%
mutate(type = ifelse(type == "new\ngenes",
"Newly\ndetected\ngenes",
ifelse(type == "shared\ngenes",
"Previously\ndetected\ngenes",
ifelse(type == "not\nrecovered\ngenes",
"Previously\ndetected\ngenes\nnot recovered",
NA))))
plt <- ggplot(plt_dat,
aes(cell_id, count)) +
geom_bar(aes(fill = type),
stat = "identity") +
labs(y = "# of Genes") +
scale_x_discrete(labels = cell_names[[resampled_lib]]) +
scale_y_continuous(labels = scales::comma) +
scale_fill_brewer(palette = "Set1", name = "") +
guides(fill = guide_legend(override.aes = list(size = 0.25))) +
theme(axis.title.x = element_blank(),
axis.text.x = element_text(angle = 45,
hjust = 1),
legend.position = "top",
plot.margin = unit(c(5.5, 50.5, 5.5, 5.5),
"points"))
# legend.key.size = unit(0.25, "pt"))
save_plot("new_genes_species_specific_barplot.pdf", plt,
base_width = 4, base_height = 5) calc_ma <- function(xmat, ymat, cell = "GCAGTTAAGTGTCCAT"){
x_rn <- rownames(xmat)
y_rn <- rownames(ymat)
xmat <- log2(xmat + 1)
ymat <- log2(ymat + 1)
rownames(xmat) <- x_rn
rownames(ymat) <- y_rn
m <- rowMeans(log2(((2^ymat + 2^xmat) / 2) + 1))
a <- xmat[, cell] - ymat[, cell]
data_frame(gene = names(a),
mean_expression_log2 = m,
log2_diff = a)
}
genes_to_plot <- rownames(sc_objs[[resampled_lib]]$umi_matrix)
cols <- colnames(sc_objs[[resampled_lib]]$norm_umi)
cell_ids <- cells[[resampled_lib]]
## append genes to reference library if necessary
ref_mat <- standardize_rows(sc_objs[[resampled_lib]]$umi_matrix[, cols],
sc_objs[[reflib]]$umi_matrix[, cols])
ma_dat <- map(cell_ids,
~calc_ma(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot, cols],
ref_mat[genes_to_plot, cols],
cell = .x))
names(ma_dat) <- cell_ids
ma_dat <- bind_rows(ma_dat, .id = "cell")
plot_ma <- function(df){
n_up <- filter(df, log2_diff > 0) %>%
group_by(cell) %>%
summarize(n = n(), n = paste0("up = ", n))
n_down <- filter(df, log2_diff < 0) %>%
group_by(cell) %>%
summarize(n = n(), n = paste0("down = ", n))
if (nrow(n_down) == 0) {
n_down = data_frame(cell = df$cell %>% unique(),
n = "down = 0")
}
plt <- ggplot(df,
aes(mean_expression_log2,
log2_diff)) +
geom_hline(aes(yintercept = 0), linetype = "dashed", colour = "grey") +
geom_point(size = 0.25) +
geom_text(data = n_up,
aes(x = max(ma_dat$mean_expression_log2) * 0.66,
y = max(ma_dat$log2_diff) * 1.2,
label = n)) +
geom_text(data = n_down,
aes(x = max(ma_dat$mean_expression_log2) * 0.66,
y = min(ma_dat$log2_diff) * 1.2,
label = n)) +
facet_wrap(~cell) +
labs(x = expression(paste("Abundance (log"[2], ")")),
y = expression(paste(frac("resampled","Original"), " (log"[2], ")")))
plt
}
plt <- plot_ma(ma_dat)
pltsave_plot("per_cell_MA_plot_all_genes.pdf", plt, base_height = 6)
## Shared genes
ma_dat <- map(cell_ids,
function(x){
genes_to_plot <- sc_objs[[resampled_lib]]$shared_genes[[x]]
calc_ma(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
cols],
ref_mat[genes_to_plot, cols],
cell = x)
})
names(ma_dat) <- cell_ids
ma_dat <- bind_rows(ma_dat, .id = "cell")
plt <- plot_ma(ma_dat)
pltsave_plot("per_cell_MA_plot_shared_genes.pdf", plt, base_height = 6)
## New genes
ma_dat <- map(cell_ids,
function(x){
genes_to_plot <- sc_objs[[resampled_lib]]$new_genes[[x]]
calc_ma(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
cols],
ref_mat[genes_to_plot, cols],
cell = x)
})
names(ma_dat) <- cell_ids
ma_dat <- bind_rows(ma_dat, .id = "cell")
plt <- plot_ma(ma_dat)
pltsave_plot("per_cell_MA_plot_new_genes.pdf", plt, base_height = 6)get_expr <- function(xmat, ymat, cell = "GCAGTTAAGTGTCCAT"){
xrows <- rownames(xmat)
xmat <- log2(xmat[, cell] + 1)
ymat <- log2(ymat[xrows, cell] + 1)
data_frame(
gene = xrows,
resampled = xmat,
original = ymat) %>%
gather(library,
Expression, -gene)
}
plot_histogram <- function(df){
ggplot(df,
aes_string("Expression")) +
geom_density(aes_string(fill = "library"),
alpha = 0.66) +
scale_fill_viridis_d(name = "") +
facet_wrap(~cell, nrow = 1) +
theme(legend.position = "top",
strip.text = element_text(size = 8))
}
expr_dat <- map(cell_ids,
function(x){
genes_to_plot <- names(sc_objs[[resampled_lib]]$genes_detected[[x]])
get_expr(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
cols],
ref_mat[genes_to_plot, cols],
cell = x)
})
names(expr_dat) <- cell_ids
expr_dat <- bind_rows(expr_dat, .id = "cell")
expressed_in_resampled_plt <- plot_histogram(expr_dat)
expressed_in_resampled_pltexpr_dat <- map(cell_ids,
function(x){
genes_to_plot <- sc_objs[[resampled_lib]]$shared_genes[[x]]
get_expr(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
cols],
ref_mat[genes_to_plot, cols],
cell = x)
})
names(expr_dat) <- cell_ids
expr_dat <- bind_rows(expr_dat, .id = "cell")
share_genes_plt <- plot_histogram(expr_dat)
share_genes_pltexpr_dat <- map(cell_ids,
function(x){
genes_to_plot <- sc_objs[[resampled_lib]]$new_genes[[x]]
get_expr(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
cols],
ref_mat[genes_to_plot, cols],
cell = x)
})
names(expr_dat) <- cell_ids
expr_dat <- bind_rows(expr_dat, .id = "cell")
new_gene_plt <- plot_histogram(expr_dat)
plts <- list(
expressed_in_resampled_plt,
share_genes_plt,
new_gene_plt
)
plt <- plot_grid(plotlist = plts, ncol = 1)
pltsave_plot("expression_histograms.pdf", plt,
ncol = 1, nrow = 3,
base_height = 4,
base_aspect_ratio = 2)get_paired_expr <- function(xmat, ymat, cell = "GCAGTTAAGTGTCCAT"){
xrows <- rownames(xmat)
xmat <- log2(xmat[, cell] + 1)
ymat <- log2(ymat[xrows, cell] + 1)
data_frame(
gene = xrows,
resampled = xmat,
original = ymat)
}
plot_scatter <- function(df){
ggplot(df,
aes_string("original", "resampled")) +
geom_point(size = 0.5) +
geom_abline(aes(slope = 1, intercept = 0)) +
facet_wrap(~cell, nrow = 1) +
expand_limits(x = 0, y = 0) +
coord_fixed() +
scale_x_continuous(breaks = scales::pretty_breaks(n = 5)) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 5)) +
theme(legend.position = "top",
strip.text = element_text(size = 8))
}
expr_dat <- map(cell_ids,
function(x){
genes_to_plot <- names(sc_objs[[resampled_lib]]$genes_detected[[x]])
get_paired_expr(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
cols],
ref_mat[genes_to_plot, cols],
cell = x)
})
names(expr_dat) <- cell_ids
expr_dat <- bind_rows(expr_dat, .id = "cell")
expressed_in_resampled_plt <- plot_scatter(expr_dat)
expressed_in_resampled_pltexpr_dat <- map(cell_ids,
function(x){
genes_to_plot <- sc_objs[[resampled_lib]]$shared_genes[[x]]
get_paired_expr(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
cols],
ref_mat[genes_to_plot, cols],
cell = x)
})
names(expr_dat) <- cell_ids
expr_dat <- bind_rows(expr_dat, .id = "cell")
share_genes_plt <- plot_scatter(expr_dat)
share_genes_pltexpr_dat <- map(cell_ids,
function(x){
genes_to_plot <- sc_objs[[resampled_lib]]$new_genes[[x]]
get_paired_expr(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
cols],
ref_mat[genes_to_plot, cols],
cell = x)
})
names(expr_dat) <- cell_ids
expr_dat <- bind_rows(expr_dat, .id = "cell")
new_gene_plt <- ggplot(expr_dat, aes(cell,resampled)) +
geom_jitter(alpha = 0.55) +
geom_violin(aes(fill = cell)) +
ylim(0, max(expr_dat$resampled) * 1.10) +
scale_fill_brewer(palette = "Set1") +
theme(axis.text.x = element_text(angle = 90,
hjust = 1, vjust = 0.5),
legend.pos = "none",
axis.title.x = element_blank())
new_gene_pltplts <- list(
expressed_in_resampled_plt,
share_genes_plt
)
plt <- plot_grid(plotlist = plts, ncol = 1)
pltsave_plot("expression_scatterplots.pdf", plt,
ncol = 1, nrow = 3,
base_height = 4,
base_aspect_ratio = 2)
save_plot("expression_newgenes_violinplots.pdf", new_gene_plt,
base_height = 6,
base_aspect_ratio = 0.5)compare_umis <- function(path_to_ctrl,
path_to_test,
return_summary = F){
## umi seqs should be produced by ./get_molecule_info
ctrl_umi_seqs <- read_tsv(path_to_ctrl,
col_names = c("barcode_10x",
"umi_molecule",
"count")) %>%
filter(barcode_10x != "Cell_unmatched")
test_umi_seqs <- read_tsv(path_to_test,
col_names = c("barcode_10x",
"umi_molecule",
"count")) %>%
filter(barcode_10x != "Cell_unmatched")
umi_seqs <- full_join(ctrl_umi_seqs,
test_umi_seqs,
by = c("barcode_10x", "umi_molecule"))
if (return_summary) {
umi_seqs %>%
mutate(new_umi = ifelse(is.na(count.x) & !is.na(count.y),
1L,
0L),
not_detected_umi = ifelse(!is.na(count.x) & is.na(count.y),
1L,
0L),
shared_umi = ifelse(!is.na(count.x) & !is.na(count.y),
1L,
0L)) %>%
group_by(barcode_10x) %>%
summarize(new_umis = sum(new_umi),
not_detected_umis = sum(not_detected_umi),
shared_umis = sum(shared_umi))
} else {
umi_seqs
}
}
umi_files <- file.path(data_dir, libs, "umis", "umigroups.txt.gz")
umi_summaries <- map(umi_files[2:4],
~compare_umis(umi_files[1], .x, return_summary = T))
names(umi_summaries) <- umi_files[2:4] %>%
str_split(., "/") %>%
map_chr(~.x[7])
umi_summary <- bind_rows(umi_summaries, .id = "library")
umis_recovered <- umi_summary %>%
gather(class, count, -barcode_10x, -library)
## annotate with resampled or not
libs <- c("standard", "biotinylated","phosphorothioate")
cell_annot <- data_frame(barcode_10x = cells,
library = libs,
resampled = T) %>%
unnest()
umis_recovered <- umis_recovered %>%
left_join(., cell_annot, by = c("library", "barcode_10x")) %>%
mutate(resampled = ifelse(is.na(resampled),
F,
resampled),
library = factor(library, levels = libs)) %>%
arrange(resampled)
plt <- ggplot(umis_recovered,
aes(barcode_10x, count)) +
geom_point(aes(colour = resampled),
size = 0.6,
alpha = 0.75) +
facet_grid(library ~ class) +
theme(axis.text.x = element_blank(),
axis.title.x = element_blank(),
strip.text.y = element_text(size = 12,
margin = margin(0,0.2,0,0.2, "cm"))) +
scale_color_manual(values = color_palette)
plt umi_seqs <- map(umi_files[2:4],
~compare_umis(umi_files[1], .x, return_summary = F))
names(umi_seqs) <- umi_files[2:4] %>%
str_split(., "/") %>%
map_chr(~.x[7])
new_umis <- map2(umi_seqs,
split(cell_annot, cell_annot$library)[libs],
~filter(.x,
barcode_10x %in% .y$barcode_10x,
!is.na(count.y),
is.na(count.x)) %>%
separate(umi_molecule, c("seq", "genome", "gene"), sep = "::") %>%
select(-starts_with("count")))
old_umis <- map2(umi_seqs,
split(cell_annot, cell_annot$library)[libs],
~filter(.x,
barcode_10x %in% .y$barcode_10x,
!is.na(count.x),
!is.na(count.y)) %>%
separate(umi_molecule, c("seq", "genome", "gene"), sep = "::") %>%
select(-starts_with("count")))
umi_edit_dist <- map2(new_umis,
old_umis,
~left_join(.x, .y,
by = c("barcode_10x", "genome", "gene")) %>%
na.omit() %>%
mutate(ed = kentr::get_hamming_pairs(seq.x, seq.y)) %>%
group_by(barcode_10x, seq.y, genome, gene) %>%
summarize(min_ed = as.integer(min(ed))) %>%
ungroup())
umi_edit_dist <- bind_rows(umi_edit_dist,
.id = "library")
cell_names_df <- map(cell_names,
~data_frame(old_id = names(.x),
new_id = .x)) %>%
bind_rows(., .id = "library")
umi_edit_dist_plt <- left_join(umi_edit_dist,
cell_names_df,
by = c("barcode_10x" = "old_id",
"library")) %>%
rename(cell_id = new_id)
plt <- ggplot(umi_edit_dist_plt, aes(cell_id,
min_ed)) +
geom_boxplot(coef = Inf) +
facet_wrap(~library, scales = "free_x",
labeller = labeller(library = lib_names)) +
scale_y_continuous(breaks= scales::pretty_breaks()) +
labs(y = "Minimum edit distance\noriginal vs. new UMIs") +
theme(axis.text.x = element_text(angle = 90,
hjust = 1,
vjust = 0.5),
axis.title.x = element_blank())
save_plot("umi_edit_distance.pdf", plt,
base_width = 8)library(Seurat)
mat <- sc_objs$original$umi_matrix
human_cell_ids <- sc_objs$original$meta_dat %>%
filter(og_purity_reads > 0.80) %>%
filter(str_detect(cell_id, "1Tumor|2LukGFP|3Brain|4Bone"),
!str_detect(cell_id, "2LukGFPDead")) %>%
pull(cell_id)
mat <- mat[, human_cell_ids]
mat <- mat[str_detect(rownames(mat), "^hg38::"), ]
sobj <- CreateSeuratObject(mat, names.field = 2, names.delim = "_")
new_ids <- sobj@meta.data %>%
rownames_to_column("cell") %>%
mutate(new_id = str_sub(orig.ident, 2),
new_id = ifelse(new_id == "LukGFP",
"Primary\ncells",
new_id),
new_id = ifelse(new_id == "Tumor",
"Primary\ntumor",
new_id)) %>%
select(cell, new_id) %>%
as.data.frame(.) %>%
column_to_rownames("cell")
sobj <- AddMetaData(sobj, new_ids)
sobj <- SetAllIdent(sobj, "new_id")
sobj <- NormalizeData(sobj)
sobj <- ScaleData(sobj, vars.to.regress = "nUMI")
sobj <- FindVariableGenes(sobj, do.plot = F)
sobj <- RunPCA(sobj, pc.genes = rownames(sobj@data), pcs.compute = 5, do.print = F)
sobj <- RunTSNE(sobj, dims.use = 1:5, seed.use = 20180427)sobj <- FindClusters(sobj,
dims.use = 1:5, save.SNN = T, print.output = F)cell_percents <- group_by(sobj@meta.data, new_id) %>%
summarize(n_cells = n()) %>%
mutate(total_cells = sum(n_cells),
percentage = 100 * (n_cells / total_cells)) %>%
select(-total_cells)
cell_percentstsne_dat <- GetCellEmbeddings(sobj, "tsne") %>%
as.data.frame() %>%
tibble::rownames_to_column("cell") %>%
left_join(sobj@meta.data %>%
as.data.frame() %>%
tibble::rownames_to_column("cell"),
., by = "cell") %>%
left_join(., cell_percents, by = "new_id") %>%
mutate(cell_labels_annotated = str_c(new_id,
" (",
signif(percentage, 3),
"%)"))
tsne_plt <- ggplot(tsne_dat,
aes(tSNE_1, tSNE_2)) +
geom_point(aes(color = cell_labels_annotated), size = 1) +
scale_color_brewer(palette = "Set1",
name = "") +
labs(title = "",
x = "tSNE 1",
y = "tSNE 2") +
theme_cowplot() +
guides(color = guide_legend(override.aes = list(size = 4))) +
theme(legend.pos = "right",
aspect.ratio = 1,
legend.key.size = unit(2, 'lines'))
save_plot("original_tsne.pdf", tsne_plt,
base_width = 5.5,
base_aspect_ratio = 1.5)mat <- sc_objs$original$umi_matrix
human_cell_ids <- sc_objs$original$meta_dat %>%
filter(str_detect(cell_id, "1Tumor|2LukGFP|3Brain|4Bone"),
!str_detect(cell_id, "2LukGFPDead")) %>%
pull(cell_id)
mat <- mat[, human_cell_ids]
resampled_ids <- sc_objs$phosphorothioate$meta_dat %>%
filter(resampled) %>%
pull(cell_id)
resampled_mat <- sc_objs$phosphorothioate$umi_matrix[, resampled_ids]
colnames(resampled_mat) <- str_c(colnames(resampled_mat), "::", "resampled")
mat <- as.data.frame(as.matrix(mat)) %>% rownames_to_column("cell")
resampled_mat <- as.data.frame(as.matrix(resampled_mat)) %>% rownames_to_column("cell")
combined_mats <- left_join(mat, resampled_mat, by = c("cell"))
combined_mats <- as.data.frame(combined_mats) %>%
column_to_rownames("cell") %>%
as.matrix() %>%
as(., "sparseMatrix")
combined_mats[is.na(combined_mats)] <- 0
sobj <- CreateSeuratObject(combined_mats, names.field = 2, names.delim = "_")
new_ids <- sobj@meta.data %>%
rownames_to_column("cell") %>%
mutate(new_id = str_sub(orig.ident, 2),
new_id = ifelse(str_detect(new_id, "LukGFP"),
"Primary\ncells",
new_id),
new_id = ifelse(str_detect(new_id, "Tumor"),
"Primary\ntumor",
new_id),
resampled = ifelse(str_detect(cell, "resampled"),
"resampled",
"not resampled"))
resampled_cell_ids <- new_ids[new_ids$resampled == "resampled", "cell"] %>%
str_replace("::resampled", "")
## add in computed purity and other values
mdata_og <- sc_objs$phosphorothioate$meta_dat %>%
filter(cell_id %in% rownames(sobj@meta.data))
resampled_mdata <- filter(mdata_og, resampled) %>%
select(cell_id, og_purity_reads = purity_reads) %>%
mutate(cell_id = str_c(cell_id, "::resampled"))
mdata_og <- mdata_og %>%
select(cell_id, og_purity_reads) %>%
bind_rows(., resampled_mdata) %>%
as.data.frame() %>%
column_to_rownames("cell_id")
new_ids <- mutate(new_ids,
resampled = ifelse(cell %in% resampled_cell_ids,
"original cell",
resampled)) %>%
select(cell, new_id, resampled) %>%
as.data.frame(.) %>%
column_to_rownames("cell")
sobj <- AddMetaData(sobj, new_ids)
sobj <- AddMetaData(sobj, mdata_og)
sobj <- SetAllIdent(sobj, "new_id")
sobj <- NormalizeData(sobj)
sobj <- ScaleData(sobj, vars.to.regress = "nUMI")
sobj <- FindVariableGenes(sobj)sobj <- RunPCA(sobj, pc.genes = rownames(sobj@data))
sobj <- RunTSNE(sobj, dims.use = 1:5, seed.use = 20180502)
tsne_plt <- TSNEPlot(sobj, colors.use = brewer.pal(7, "Set1")) +
labs(title = "Breast Cancer PDX") +
theme(legend.position = "bottom")resampled_cells <- TSNEPlot(SetAllIdent(sobj,
"resampled"),
colors.use = c("lightgrey",
brewer.pal(7, "Set1")[1:2])) +
labs(title = "Breast Cancer PDX") +
theme(legend.position = "bottom")tmp <- resampled_cells$data
og_cell_dat <- tmp[cells$phosphoro, ] %>%
rownames_to_column("cell")
resampled_cell_dat <- tmp[str_c(cells$phosphoro, "::resampled"), ] %>%
rownames_to_column("cell") %>%
mutate(cell = str_replace(cell, "::resampled", "")) %>%
select(cell, tSNE_1, tSNE_2)
cell_dat <- left_join(og_cell_dat,
resampled_cell_dat,
by = "cell",
suffix = c("", "_resampled"))
resampled_cells <- TSNEPlot(SetAllIdent(sobj,
"resampled"),
colors.use = c("lightgrey",
brewer.pal(7, "Set1")[1:2])) +
geom_segment(data = cell_dat,
aes(x = tSNE_1,
y = tSNE_2,
xend = tSNE_1_resampled,
yend = tSNE_2_resampled),
linejoin = "mitre",
arrow = arrow(length = unit(0.03, "npc"))) +
labs(title = "Breast Cancer PDX") +
theme(legend.position = "bottom")resampled_cells_purity <- FeaturePlot(sobj,
features.plot = "og_purity_reads",
no.legend = F,
do.return = T)$og_purity_reads +
geom_segment(data = cell_dat,
aes(x = tSNE_1,
y = tSNE_2,
xend = tSNE_1_resampled,
yend = tSNE_2_resampled),
linejoin = "mitre",
arrow = arrow(length = unit(0.01, "npc"))) +
labs(title = "Breast Cancer PDX") +
theme(legend.position = "bottom")plt <- plot_grid(tsne_plt,
resampled_cells,
resampled_cells_purity,
nrow = 1)
save_plot("resampled_tsne.pdf", plt,
nrow = 1, ncol = 3,
base_height = 5.5, base_width = 4.25)Find the k-nearest neighboors in PCA and gene expression space
## use combined data from above
data.use <- GetCellEmbeddings(object = sobj,
reduction.type = "pca",
dims.use = 1:20)
## find top 10 nearest neighboors using exact search
knn <- RANN::nn2(data.use, k = 5,
searchtype = 'standard',
eps = 0)
resampled_idxs <- knn$nn.idx[str_detect(rownames(data.use), "::resampled"), ]
nn_ids <- as_data_frame(t(apply(resampled_idxs, 1,
function(x)rownames(data.use)[x])))
colnames(nn_ids) <- c("query_cell", paste0("nearest neighbor ", 1:(ncol(nn_ids) - 1)))
nn_idssc_metadat <- map(sc_objs["original"], ~.x$meta_dat) %>%
bind_rows(.id = "library") %>%
filter(str_detect(cell_id, "1Tumor|2LukGFP|3Brain|4Bone"),
!str_detect(cell_id, "2LukGFPDead")) %>%
mutate(cell_type = str_split(cell_id, "_", simplify = T)[, 2],
cell_type = str_sub(cell_type, 2),
cell_type = ifelse(cell_type == "LukGFP",
"Primary\ncells",
cell_type),
cell_type = ifelse(cell_type == "Tumor",
"Primary\ntumor",
cell_type))
sc_metadat <- arrange(sc_metadat, resampled)
a <- ggplot(sc_metadat, aes(cell_id, total_reads)) +
geom_point(aes(color = resampled), pt.size = 0.75) +
labs(x = "Cell",
y = "Sequencing Depth") +
scale_y_log10(labels = scales::comma, breaks = c(1e3, 1e4,1e5,1e6)) +
scale_color_manual(values = color_palette) +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.x = element_blank(),
legend.position = "top")
b <- ggplot(sc_metadat, aes(cell_id, total_reads)) +
geom_tile(aes(cell_id,
y = -.1,
height = 0.05 , fill = cell_type)) +
scale_fill_brewer(palette = "Set1", name = "Cell Type") +
theme(
axis.text = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank(),
axis.line = element_blank(),
legend.position = "bottom",
plot.margin=unit(c(0,0,0,0), "lines")
)
g1 <- ggplotGrob(a)
g2 <- ggplotGrob(b)
maxWidth = grid::unit.pmax(g1$widths, g2$widths)
g1$widths <- as.list(maxWidth)
g2$widths <- as.list(maxWidth)
grid::grid.newpage()
plt <- gridExtra::grid.arrange(gridExtra::arrangeGrob(g1, g2, nrow = 2 , heights=c(.9, .175)))grid::grid.newpage()
save_plot("selected_cells.pdf", plt, base_height = 5)resampled_metadat <- map(sc_objs, ~.x$meta_dat) %>%
bind_rows(.id = "library") %>%
mutate(library = factor(library,
levels = unlist(libs))) %>%
arrange(resampled)
cell_names_df <- map(cell_names,
~data_frame(old_id = names(.x),
new_id = .x)) %>%
bind_rows(., .id = "library")
plt_dat <- resampled_metadat %>%
filter(resampled, library != reflib) %>%
left_join(.,
cell_names_df, by = c("cell_id" = "old_id",
"library")) %>%
select(new_id, library, contains("purity")) %>%
gather(type, purity, -new_id, -library) %>%
mutate(type = ifelse(str_detect(type, "^og"),
str_replace(type, "^og_", "Original_"),
str_c("Resampled_", type)),
library = factor(library, levels = resampled_libs)) %>%
separate(type, c("resampled", "not_import", "count_type"), sep = "_") %>%
select(-not_import)
plt_dat <- mutate(plt_dat,
count_type = ifelse(count_type == "reads",
"Reads",
"UMIs"))
plt <- ggplot(plt_dat, aes(new_id, purity)) +
geom_hline(aes(yintercept = 0.80), linetype = "dashed",
color = "grey", alpha = 0.75) +
geom_hline(aes(yintercept = 0.20), linetype = "dashed",
color = "grey", alpha = 0.75) +
geom_point(aes(color = resampled),
position = position_dodge(width = 0.5)) +
facet_wrap(count_type~library,
labeller = labeller(library = lib_names),
scales = "free_x", nrow = 1) +
scale_color_brewer(palette = "Set1", name = "") +
guides(color = guide_legend(override.aes = list(size = 5))) +
labs(x = "",
y = expression(paste( "Species Purity ",
frac("Human", "Human + Mouse")))) +
theme(axis.text.x = element_text(angle = 90,
hjust = 1,
vjust = 0.5),
legend.pos = "top")
save_plot("species_specificity.pdf", plt,
base_width = 12, base_height= 6.5)